Introduction

Data import & preprocessing

We are going to first import the merged counts tables that is outputted from the Nextflow pipeline. This table already has the raw counts summarised from the alignments and merged into a single matrix.

Import data

# Import CD4 counts
counts = readRDS('../data/ITO66-counts.rda')
head(counts)
##             Bcells_alone_1 Bcells_alone_2 Bcells_cntrl Bcells_Tcells_1
## KLHL17_001             189            160          188             107
## AGRN_001               505           1499         1379            1167
## B3GALT6_001            301            330          255             179
## DVL1_001               232            299          344             217
## DVL1_002               137             77          125              90
## DVL1_003               232            108          145              68
##             Bcells_Tcells_2 Bcells_Tcells_3 PlasmidDNA
## KLHL17_001              130              35          3
## AGRN_001               1038            1167       2852
## B3GALT6_001             208              85         53
## DVL1_001                299              44         38
## DVL1_002                123              18          6
## DVL1_003                 83              45         31
# Import CD4 metadata
metadata = readRDS('../data/ITO66-metadata.rda')
head(metadata)
##                          Sample  Group
## Bcells_alone_1   Bcells_alone_1  Bcell
## Bcells_alone_2   Bcells_alone_2  Bcell
## Bcells_Tcells_1 Bcells_Tcells_1 BTcell
## Bcells_Tcells_2 Bcells_Tcells_2 BTcell
## Bcells_Tcells_3 Bcells_Tcells_3 BTcell
## Bcells_cntrl       Bcells_cntrl  Bcell
# Import MultiQC results
report = readRDS('../data/ITO66-report.rda')
head(report)
## # A tibble: 6 × 47
##   Sample   raw_total_seque… filtered_sequen… sequences is_sorted `1st_fragments`
##   <chr>               <dbl>            <dbl>     <dbl>     <dbl>           <dbl>
## 1 Bcells_…           615906                0    615906         1          615906
## 2 Bcells_…           590307                0    590307         1          590307
## 3 Bcells_…           438168                0    438168         1          438168
## 4 Bcells_…           332876                0    332876         1          332876
## 5 Bcells_…           865567                0    865567         1          865567
## 6 Bcells_…           784799                0    784799         1          784799
## # … with 41 more variables: last_fragments <dbl>, reads_mapped <dbl>,
## #   reads_mapped_and_paired <dbl>, reads_unmapped <dbl>,
## #   reads_properly_paired <dbl>, reads_paired <dbl>, reads_duplicated <dbl>,
## #   reads_MQ0 <dbl>, reads_QC_failed <dbl>, `non-primary_alignments` <dbl>,
## #   total_length <dbl>, total_first_fragment_length <dbl>,
## #   total_last_fragment_length <dbl>, bases_mapped <dbl>,
## #   `bases_mapped_(cigar)` <dbl>, bases_trimmed <dbl>, …

Number of mapped reads

p1 = report %>% 
  select(Sample, reads_unmapped, reads_mapped) %>% 
  mutate(Sample = str_replace(Sample, '-bamstats', '')) %>% 
  pivot_longer(cols = c("reads_unmapped", "reads_mapped"), names_to = "feature", values_to = "reads") %>% 
    mutate(feature = if_else(feature == "reads_unmapped", "Unmapped", feature)) %>% 
  mutate(feature = if_else(feature == "reads_mapped", "Mapped", feature)) %>% 
  ggplot(aes(x = reorder(Sample, reads), y = reads, fill = feature)) +
  geom_col() +
  coord_flip() +
  theme_classic(base_size = 10) +
  theme(legend.position = "top") +
  scale_fill_brewer(palette = "Set2") +
  xlab("") +
  ylab("Number of reads") +
  ggtitle("Library mapping rate")
p1

### Percent of mapped reads
p2 = report %>% 
  select(Sample, reads_unmapped_percent, reads_mapped_percent ) %>% 
  mutate(Sample = str_replace(Sample, '-bamstats', '')) %>% 
  pivot_longer(cols = c("reads_unmapped_percent", "reads_mapped_percent"), names_to = "feature", values_to = "reads") %>% 
  mutate(reads = reads / 100) %>% 
  mutate(feature = if_else(feature == "reads_unmapped_percent", "Unmapped", feature)) %>% 
  mutate(feature = if_else(feature == "reads_mapped_percent", "Mapped", feature)) %>% 
  ggplot(aes(x = reorder(Sample, reads), y = reads, fill = feature)) +
  geom_col() +
  coord_flip() +
  theme_classic(base_size = 10) +
  theme(legend.position = "top") +
  scale_fill_brewer(palette = "Set2") +
  scale_y_continuous(labels = scales::percent) +
  xlab("") +
  ylab("Percent of reads")
p2

# Plot together
p = (p1 | p2) + plot_layout(guides = "collect") & theme(legend.position = "bottom")
p

ggsave("figures/ITO66-mapping-qc.pdf", p, width = 8, height = 5)

Normalize data

# Match the sample ID's between samples
counts = counts[,match(metadata$Sample, colnames(counts))]

# Make DESeq2 object
ddsMat <- DESeqDataSetFromMatrix(countData = counts,
                                 colData = metadata,
                                 design = ~Group)

# Get normalize counts
ddsMat <- estimateSizeFactors(ddsMat)

# Filter flow counts
ddsMat.fil = ddsMat %>% 
  filter_low(percentile = 0.05)

# Get normalized counts 
counts.norm.fil <- (counts(ddsMat.fil, normalized = TRUE) + 0.5) %>% 
  as.data.frame() %>% 
  rownames_to_column("oligo_id")

# Create a long table 
counts.norm.fil.long = counts.norm.fil %>% 
  pivot_longer(cols = -oligo_id, names_to = "sampleId", values_to = "norm")

Technical replicates

Bcells_alone_1 appears to be too noisy for inclusion as as replicate

# Bcell
counts.norm.fil %>% 
  ggplot(., aes(x = log10(Bcells_alone_1), y = log10(Bcells_alone_2))) +
  geom_point(colour = alpha("grey", 0.7)) +
  xlab("Bcell #1") +
  ylab("Bcell #2") +
  theme_classic(base_size = 9) +
  stat_cor(method = "pearson", aes(label = ..r.label..)) +
  stat_smooth(method = "lm", color = "red") 
## `geom_smooth()` using formula 'y ~ x'

counts.norm.fil %>% 
  ggplot(., aes(x = log10(Bcells_alone_1), y = log10(Bcells_cntrl))) +
  geom_point(colour = alpha("grey", 0.7)) +
  xlab("Bcell #1") +
  ylab("Bcell-ctrl #1") +
  theme_classic(base_size = 9) +
  stat_cor(method = "pearson", aes(label = ..r.label..)) +
  stat_smooth(method = "lm", color = "red") 
## `geom_smooth()` using formula 'y ~ x'

counts.norm.fil %>% 
  ggplot(., aes(x = log10(Bcells_alone_2), y = log10(Bcells_cntrl))) +
  geom_point(colour = alpha("grey", 0.7)) +
  xlab("Bcell #2") +
  ylab("Bcell-ctrl #1") +
  theme_classic(base_size = 9) +
  stat_cor(method = "pearson", aes(label = ..r.label..)) +
  stat_smooth(method = "lm", color = "red") 
## `geom_smooth()` using formula 'y ~ x'

# B & T-cells
counts.norm.fil %>% 
  ggplot(., aes(x = log10(Bcells_Tcells_1), y = log10(Bcells_Tcells_2))) +
  geom_point(colour = alpha("grey", 0.7)) +
  xlab("BT cells #1") +
  ylab("BT cells #2") +
  theme_classic(base_size = 9) +
  stat_cor(method = "pearson", aes(label = ..r.label..)) +
  stat_smooth(method = "lm", color = "red") 
## `geom_smooth()` using formula 'y ~ x'

counts.norm.fil %>% 
  ggplot(., aes(x = log10(Bcells_Tcells_1), y = log10(Bcells_Tcells_3))) +
  geom_point(colour = alpha("grey", 0.7)) +
  xlab("BT cells #1") +
  ylab("BT cells #3") +
  theme_classic(base_size = 9) +
  stat_cor(method = "pearson", aes(label = ..r.label..)) +
  stat_smooth(method = "lm", color = "red")
## `geom_smooth()` using formula 'y ~ x'

counts.norm.fil %>% 
  ggplot(., aes(x = log10(Bcells_Tcells_2), y = log10(Bcells_Tcells_3))) +
  geom_point(colour = alpha("grey", 0.7)) +
  xlab("BT cells #2") +
  ylab("BT cells #3") +
  theme_classic(base_size = 9) +
  stat_cor(method = "pearson", aes(label = ..r.label..)) +
  stat_smooth(method = "lm", color = "red")
## `geom_smooth()` using formula 'y ~ x'

Plot normalized histogram

counts.norm.fil.long %>% 
  ggplot(aes(x = log10(norm))) +
  geom_histogram(aes(y=..density..), binwidth=0.25, colour="black", fill="white") +
  geom_density(alpha = 0.2, fill = "#FF6666") +
  facet_wrap(sampleId~.) +
  theme_light(base_size = 11) +
  ylab("Density") +
  xlab("Normalized reads per oligo (log10)") 


Determine dropout oligo’s

# Keep samples of interest
ddsMat.fil = ddsMat[,which(colnames(assay(ddsMat)) %in% c("Bcells_alone_2", "Bcells_cntrl", "Bcells_Tcells_2", "Bcells_Tcells_1"))]

# Detect dropouts
dropout.res = compute_stats(ddsMat.fil, trt = "BTcell", ref = "Bcell", fitType = "parametric", lfcThreshold = 0.25, alpha = 0.05) %>% 
  mutate(log2FoldChange = round(log2FoldChange, 2)) %>% 
  mutate(log2FoldChangeShrunken = round(log2FoldChangeShrunken, 2)) 
## using pre-existing size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## using 'normal' for LFC shrinkage, the Normal prior from Love et al (2014).
## 
## Note that type='apeglm' and type='ashr' have shown to have less bias than type='normal'.
## See ?lfcShrink for more details on shrinkage type, and the DESeq2 vignette.
## Reference: https://doi.org/10.1093/bioinformatics/bty895
## 
## out of 1804 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0.25 (up)    : 0, 0%
## LFC < -0.25 (down) : 2, 0.11%
## outliers [1]       : 0, 0%
## low counts [2]     : 0, 0%
## (mean count < 0)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
# Significant oligos
dropout.res %>% 
  filter(padj < 0.05)
##      oligo_id baseMean log2FoldChange     lfcSE       stat       pvalue
## 1    ABT1_001 295.7037          -2.93 0.2468228 -10.861683 8.774198e-28
## 2 EEF1A1_001B 137.5964          -2.33 0.3282979  -6.342926 1.127208e-10
##           padj log2FoldChangeShrunken
## 1 1.582865e-24                  -1.44
## 2 1.016742e-07                  -0.82
# Browse
dropout.res %>% 
  select(oligo_id, log2FoldChange, padj) %>% 
  DT::datatable(extensions = 'Buttons', options = list(
    dom = 'Bfrtip',
    buttons = c('copy', 'csv', 'excel', 'pdf', 'print')
  ))
# Plot MA-plot
ito66.maplot = dropout.res %>% 
  ggplot(aes(x = log2(baseMean), y = log2FoldChange, label = oligo_id)) +
  geom_point(colour = alpha("grey", 0.7)) +
  geom_point(data = filter(dropout.res, oligo_id %in% c("ABT1_001")), size = 3, colour = "#efc383") +
  geom_point(data = filter(dropout.res, oligo_id %in% c("EEF1A1_001B")), size = 3, colour = "#efc383") +
  geom_text_repel(data = filter(dropout.res, oligo_id %in% c("ABT1_001", "EEF1A1_001B")), box.padding = 0.5, max.overlaps = Inf, size = 2.5) +
  theme_classic() +
  xlim(5, 10) +
  ylim(-3, 2) +
  xlab("Mean counts (log2)") +
  ylab("Fold change log2(B&Tcells/Bcells)")  +
  ggtitle("ITO66")
ggplotly(ito66.maplot)

Session info

sessionInfo()
## R version 4.1.3 (2022-03-10)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.5 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/liblapack.so.3
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices datasets  utils     methods  
## [8] base     
## 
## other attached packages:
##  [1] forcats_0.5.1               stringr_1.4.0              
##  [3] dplyr_1.0.9                 purrr_0.3.4                
##  [5] readr_2.1.2                 tidyr_1.2.0                
##  [7] tibble_3.1.7                tidyverse_1.3.1            
##  [9] ggpubr_0.4.0                plotly_4.10.0              
## [11] ggplotify_0.1.0             readxl_1.4.0               
## [13] DESeq2_1.34.0               SummarizedExperiment_1.24.0
## [15] Biobase_2.54.0              MatrixGenerics_1.6.0       
## [17] matrixStats_0.62.0          GenomicRanges_1.46.1       
## [19] GenomeInfoDb_1.30.1         IRanges_2.28.0             
## [21] S4Vectors_0.32.4            BiocGenerics_0.40.0        
## [23] ggrepel_0.9.1               ggplot2_3.3.6              
## [25] ggsci_2.9                   patchwork_1.1.1            
## [27] knitr_1.39                 
## 
## loaded via a namespace (and not attached):
##   [1] colorspace_2.0-3       ggsignif_0.6.3         ellipsis_0.3.2        
##   [4] XVector_0.34.0         fs_1.5.2               rstudioapi_0.13       
##   [7] farver_2.1.0           DT_0.25                bit64_4.0.5           
##  [10] AnnotationDbi_1.56.2   fansi_1.0.3            lubridate_1.8.0       
##  [13] xml2_1.3.3             splines_4.1.3          cachem_1.0.6          
##  [16] geneplotter_1.72.0     jsonlite_1.8.0         broom_0.8.0           
##  [19] annotate_1.72.0        dbplyr_2.1.1           png_0.1-7             
##  [22] BiocManager_1.30.18    compiler_4.1.3         httr_1.4.2            
##  [25] backports_1.4.1        assertthat_0.2.1       Matrix_1.4-0          
##  [28] fastmap_1.1.0          lazyeval_0.2.2         cli_3.3.0             
##  [31] htmltools_0.5.2        tools_4.1.3            gtable_0.3.0          
##  [34] glue_1.6.2             GenomeInfoDbData_1.2.7 Rcpp_1.0.8.3          
##  [37] carData_3.0-5          cellranger_1.1.0       jquerylib_0.1.4       
##  [40] vctrs_0.4.1            Biostrings_2.62.0      nlme_3.1-155          
##  [43] crosstalk_1.2.0        xfun_0.30              rvest_1.0.2           
##  [46] lifecycle_1.0.1        renv_0.15.4            rstatix_0.7.0         
##  [49] XML_3.99-0.10          zlibbioc_1.40.0        scales_1.2.0          
##  [52] hms_1.1.1              parallel_4.1.3         RColorBrewer_1.1-3    
##  [55] yaml_2.3.5             memoise_2.0.1          yulab.utils_0.0.5     
##  [58] sass_0.4.1             stringi_1.7.6          RSQLite_2.2.15        
##  [61] highr_0.9              genefilter_1.76.0      BiocParallel_1.28.3   
##  [64] rlang_1.0.2            pkgconfig_2.0.3        bitops_1.0-7          
##  [67] evaluate_0.15          lattice_0.20-45        labeling_0.4.2        
##  [70] htmlwidgets_1.5.4      bit_4.0.4              tidyselect_1.1.2      
##  [73] magrittr_2.0.3         R6_2.5.1               generics_0.1.2        
##  [76] DelayedArray_0.20.0    DBI_1.1.2              mgcv_1.8-39           
##  [79] pillar_1.7.0           haven_2.5.0            withr_2.5.0           
##  [82] survival_3.2-13        KEGGREST_1.34.0        abind_1.4-5           
##  [85] RCurl_1.98-1.6         modelr_0.1.8           crayon_1.5.1          
##  [88] car_3.0-13             utf8_1.2.2             tzdb_0.3.0            
##  [91] rmarkdown_2.14         locfit_1.5-9.6         grid_4.1.3            
##  [94] data.table_1.14.2      blob_1.2.3             reprex_2.0.1          
##  [97] digest_0.6.29          xtable_1.8-4           gridGraphics_0.5-1    
## [100] munsell_0.5.0          viridisLite_0.4.0      bslib_0.3.1